# Import libraries and data
import pandas as pd
import numpy as np
import os
import argparse
import random
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from collections import Counter
import math
import operator
from scipy.spatial.distance import cosine
import csv
os.getcwd()
data=pd.read_csv('C:/Users/mxm5116/Desktop/Data Mining/iris.csv')
data.head()
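# A quick sanity check (not in the original notebook): the standard iris data
# has 50 rows per class, so the three classes should be perfectly balanced.
data['class'].value_counts()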
# Let's visualize how the classes separate
# First, the sepal pattern
ax = data[data['class'] == 'Iris-setosa'].plot.scatter(x='sepal length in cm', y='sepal width in cm', c='blue', label='Iris-setosa')
ax = data[data['class'] == 'Iris-versicolor'].plot.scatter(x='sepal length in cm', y='sepal width in cm', c='orange', label='Iris-versicolor', ax=ax)
ax = data[data['class'] == 'Iris-virginica'].plot.scatter(x='sepal length in cm', y='sepal width in cm', c='red', label='Iris-virginica', ax=ax)
ax
# Now let's see the petal pattern
ax = data[data['class'] == 'Iris-setosa'].plot.scatter(x='petal length in cm', y='petal width in cm', c='blue', label='Iris-setosa')
ax = data[data['class'] == 'Iris-versicolor'].plot.scatter(x='petal length in cm', y='petal width in cm', c='orange', label='Iris-versicolor', ax=ax)
ax = data[data['class'] == 'Iris-virginica'].plot.scatter(x='petal length in cm', y='petal width in cm', c='red', label='Iris-virginica', ax=ax)
ax
# create design matrix X and target vector y
X = np.array(data.iloc[:, 0:4])  # .ix is deprecated; use .iloc for positional indexing
y = np.array(data['class'])
Dev_data_X, test_data_X, Dev_data_y, test_data_y = train_test_split(
    X, y, test_size=0.40, random_state=42)
print(Dev_data_X)
print(test_data_X)
print(Dev_data_y)
print(test_data_y)
print(Dev_data_X.shape)
print(test_data_X.shape)
print(Dev_data_y.shape)
print(test_data_y.shape)
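# The iris classes are balanced, so a plain random split usually works, but
# train_test_split also accepts stratify= to preserve the class proportions
# exactly. A minimal alternative sketch (these variables are illustrative and
# not used in the rest of the notebook):
Dev_strat_X, test_strat_X, Dev_strat_y, test_strat_y = train_test_split(
    X, y, test_size=0.40, random_state=42, stratify=y)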
# Plan for the rest of the notebook:
# * Calculate accuracy by iterating over all of the development data points
# * Find the optimal hyperparameters (a cross-validation sketch follows below)
# * Draw bar charts for the accuracy
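# Before running the full grid search, a lighter way to score a single k on the
# development set alone is k-fold cross-validation; a minimal sketch (cv=5 is an
# assumption, not a setting taken from this notebook):
from sklearn.model_selection import cross_val_score
for k in [1, 3, 5, 7]:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), Dev_data_X, Dev_data_y, cv=5)
    print('k={}: mean CV accuracy {:.4f}'.format(k, scores.mean()))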
# Now let's run KNN with a range of k values
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
params = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
          'leaf_size': [1, 2, 3, 5, 10, 15, 20, 30],
          'weights': ['uniform', 'distance'],
          'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
          'n_jobs': [-1]}
Dev_model = KNeighborsClassifier(n_jobs=-1)
KNN_model = GridSearchCV(Dev_model, param_grid=params, n_jobs=1)
KNN_model.fit(Dev_data_X,Dev_data_y)
print("Best Hyper Parameters:\n",KNN_model.best_params_)
prediction=KNN_model.predict(test_data_X)
print("Accuracy:",metrics.accuracy_score(prediction,test_data_y))
print("Confusion Metrix:\n",metrics.confusion_matrix(prediction,test_data_y))
# Now apply Euclidean distance in KNN step by step
def train(Dev_data_X, Dev_data_y):
    # KNN is a lazy learner: there is no training step, so this is a no-op
    return

def predict(Dev_data_X, Dev_data_y, test_data_X, k):
    distances = []
    targets = []
    # compute the Euclidean distance from the test point to every development point
    for i in range(len(Dev_data_X)):
        distances.append([np.sqrt(np.sum(np.square(test_data_X - Dev_data_X[i, :]))), i])
    distances = sorted(distances)
    # make a list of the k nearest neighbors' targets
    for i in range(k):
        index = distances[i][1]
        targets.append(Dev_data_y[index])
    # return the most common target among the k neighbors
    return Counter(targets).most_common(1)[0][0]
def k_nearest_neighbor(Dev_data_X, Dev_data_y, test_data_X, k):
    # "train" on the development data (a no-op for KNN)
    train(Dev_data_X, Dev_data_y)
    # loop over all test observations and classify each one
    predictions = []
    for i in range(len(test_data_X)):
        predictions.append(predict(Dev_data_X, Dev_data_y, test_data_X[i, :], k))
    return np.asarray(predictions)
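# The loop above makes O(n) Python-level calls per test point. A vectorized
# equivalent using NumPy broadcasting (a sketch for comparison, not the
# implementation used below):
def predict_vectorized(Dev_data_X, Dev_data_y, test_point, k):
    dists = np.linalg.norm(Dev_data_X - test_point, axis=1)  # all distances at once
    nearest = np.argsort(dists)[:k]                          # indices of the k closest
    return Counter(Dev_data_y[nearest]).most_common(1)[0][0]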
# Now make the predictions for a range of k values and evaluate the accuracy of each
for k in [1, 3, 5, 7, 9, 11, 13]:
    predictions = k_nearest_neighbor(Dev_data_X, Dev_data_y, test_data_X, k)
    accuracy = accuracy_score(test_data_y, predictions)
    print("accuracy{} {}".format(k, 100 * accuracy))
# Draw a bar chart of the accuracy for each k (matplotlib and numpy are imported above)
objects = ('k=1', 'k=3', 'k=5', 'k=7', 'k=9', 'k=11', 'k=13')
y_pos = np.arange(len(objects))
performance = [98.33, 98.33, 98.33, 98.33, 98.33, 100.0, 100.0]
plt.bar(y_pos, performance, align='center', alpha=0.5, color='green')
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy (%)')
plt.title('KNN accuracy by k value (Euclidean distance)')
plt.show()
# Normalize the data: min-max scale each feature to [0, 1]
# (min and max must be taken per column with axis=0, not over the whole array)
normalize = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
print(normalize)
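# Strictly speaking, scaling before the split lets the test set influence the
# scaling parameters (data leakage). A leak-free sketch using sklearn's
# MinMaxScaler, fit on the development portion only (an alternative, not what
# the rest of this notebook does):
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(Dev_data_X)         # learn per-feature min/max from dev data only
Dev_scaled_X = scaler.transform(Dev_data_X)
test_scaled_X = scaler.transform(test_data_X)   # apply the same transform to the test data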
Dev_norm_data_X, test_norm_data_X, Dev_norm_data_y, test_norm_data_y = train_test_split(
    normalize, y, test_size=0.40, random_state=42)
print(Dev_norm_data_X)
print(test_norm_data_X)
print(Dev_norm_data_y)
print(test_norm_data_y)
# Now let's run KNN with a range of k values on the normalized data
params = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
          'leaf_size': [1, 2, 3, 5, 10, 15, 20, 30],
          'weights': ['uniform', 'distance'],
          'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
          'n_jobs': [-1]}
Dev_model = KNeighborsClassifier(n_jobs=-1)
KNN_model = GridSearchCV(Dev_model, param_grid=params, n_jobs=1)
KNN_model.fit(Dev_norm_data_X,Dev_norm_data_y)
print("Best Hyper Parameters:\n",KNN_model.best_params_)
prediction=KNN_model.predict(test_norm_data_X)
print("Accuracy:",metrics.accuracy_score(prediction,test_norm_data_y))
print("Confusion Metrix:\n",metrics.confusion_matrix(prediction,test_norm_data_y))
# The step-by-step Euclidean KNN defined above takes its data as arguments, so
# the same train/predict/k_nearest_neighbor functions are reused here on the
# normalized data; no redefinition is needed.
# Now make the predictions for the same range of k values on the normalized data
for k in [1, 3, 5, 7, 9, 11, 13]:
    predictions = k_nearest_neighbor(Dev_norm_data_X, Dev_norm_data_y, test_norm_data_X, k)
    accuracy = accuracy_score(test_norm_data_y, predictions)
    print("accuracy{} {}".format(k, 100 * accuracy))
def load_dataset(data, split):
    # read the CSV, convert the four feature columns to float, and randomly
    # assign each row to the training or test set with probability `split`
    training_set = []
    test_set = []
    with open(data, 'r') as csvfile:
        lines = csv.reader(csvfile)
        dataset = list(lines)
        for x in range(len(dataset)):
            for y in range(4):
                dataset[x][y] = float(dataset[x][y])
            if random.random() < split:
                training_set.append(dataset[x])
            else:
                test_set.append(dataset[x])
    return training_set, test_set
def cosine_distance(instance1, instance2):
    # the last element of each instance is the class label, so compare features only
    p1 = instance1[:-1]
    p2 = instance2[:-1]
    return cosine(p1, p2)  # scipy's cosine() returns 1 - cosine similarity
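# A quick sanity check (illustrative only, not part of the pipeline): scipy's
# value should match 1 - (p1 . p2) / (||p1|| * ||p2||) computed by hand.
p1 = np.array([1.0, 2.0, 3.0, 4.0])
p2 = np.array([4.0, 3.0, 2.0, 1.0])
manual = 1 - p1.dot(p2) / (np.linalg.norm(p1) * np.linalg.norm(p2))
print(manual, cosine(p1, p2))  # the two numbers should agree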
def get_neighbors(training_set, test_instance, k):
    # rank every training instance by cosine distance to the test instance
    distances = []
    for x in range(len(training_set)):
        dist = cosine_distance(test_instance, training_set[x])
        distances.append((training_set[x], dist))
    distances.sort(key=operator.itemgetter(1))
    # collect the k nearest instances (the list must be created before the loop)
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors
def get_response(neighbors):
    # majority vote over the neighbors' class labels (the last element of each row)
    class_votes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        class_votes[response] = class_votes.get(response, 0) + 1
    sorted_votes = sorted(class_votes.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_votes[0][0]
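# An equivalent one-liner using collections.Counter (already imported above),
# mirroring the majority vote in the Euclidean implementation:
def get_response_counter(neighbors):
    return Counter(neighbor[-1] for neighbor in neighbors).most_common(1)[0][0]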
def get_accuracy(test_set, predictions):
    # fraction of test rows whose true label matches the prediction, as a percentage
    correct = 0
    for x in range(len(test_set)):
        if test_set[x][-1] == predictions[x]:
            correct += 1
    return (correct / float(len(test_set))) * 100.0
# Prepare the data with a 60/40 random split, then run cosine-distance KNN with k=7
split = 0.60
training_set, test_set = load_dataset('iris1.csv', split)
print('Train set: ' + repr(len(training_set)))
print('Test set: ' + repr(len(test_set)))
predictions = []
for x in range(len(test_set)):
    neighbors = get_neighbors(training_set, test_set[x], 7)
    result = get_response(neighbors)
    predictions.append(result)
    print('> predicted=' + repr(result) + ', actual=' + repr(test_set[x][-1]))
accuracy = get_accuracy(test_set, predictions)
print('Accuracy: ' + repr(accuracy) + '%')
# Let's draw the final accuracy bar chart for the different distance metrics, with the best k=7
objects = ('Euclidean', 'Normalized Euclidean', 'Cosine')
y_pos = np.arange(len(objects))
performance = [98.33, 98.33, 91.66]
plt.bar(y_pos, performance, align='center', alpha=0.5, color='red')
plt.xticks(y_pos, objects)
plt.ylabel('Accuracy (%)')
plt.title('Accuracy of distance metrics with the best hyperparameter k=7 (KNN, iris dataset)')
plt.show()
# References:
# 01. https://www.kaggle.com/mayu0116/hyper-parameters-tuning-of-dtree-rf-svm-knn
# 02. https://kevinzakka.github.io/2016/07/13/k-nearest-neighbor/
# 03. https://github.com/dtroupe18/SimpleKNN/blob/master/knn.py